import jsonlines

from datasets import load_dataset
from tqdm import tqdm
# Load the train split of the preference dataset that is filtered below.
raw_dataset = load_dataset(
    "/share/project/yuyang/data_pro/data_version/gemma2-ultrafeedback-armorm",
    split="train",
)

# Gather the content of the first "chosen" message for every record in the
# label file whose label["ability"] is non-empty.
contexts = set()
with jsonlines.open("ultrafeedback_label.jsonl") as labeled_reader:
    for record in tqdm(labeled_reader):
        if len(record["label"]["ability"]) > 0:
            contexts.add(record["chosen"][0]["content"])

# Keep only the dataset rows whose first "chosen" content was NOT gathered
# above, i.e. rows without a non-empty "ability" label in the label file.
datas = [
    row
    for row in raw_dataset
    if row["chosen"][0]["content"] not in contexts
]

# Write the remaining rows out, one JSON object per line.
with jsonlines.open("ultrafeedback_label_filter.jsonl", "w") as filtered_writer:
    filtered_writer.write_all(datas)



